library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(phyloseq)
library(microbiome)
##
## microbiome R package (microbiome.github.com)
##
##
##
## Copyright (C) 2011-2022 Leo Lahti,
## Sudarshan Shetty et al. <microbiome.github.io>
##
##
## Attaching package: 'microbiome'
##
## The following object is masked from 'package:ggplot2':
##
## alpha
##
## The following object is masked from 'package:base':
##
## transform
#input = dataframe of features x labels (including sample names)
# List of foods in order of model importance
plot_food_direction = function(food_ranking, data, variable) {
for(food in food_ranking) {
cols = colnames(data)
if(food %in% cols){
plot = data %>%
select(variable, food) %>%
ggplot()+
geom_boxplot(aes_string(x= variable, y= food, colour = variable))+
geom_jitter(aes_string(x= variable, y= food, colour = variable), height=0)+
theme_classic()
print (plot)
}
}
}
aha_pred = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/aha_to_choice_predictions.csv")
## Rows: 3420 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): sample, pred, id, treatment, true_week, timepoint
## dbl (4): Case, Control, auc, iteration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aha_importance = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/aha_to_choice_feature_importance.csv")
## Rows: 20 Columns: 191
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): label
## dbl (190): ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pomms_pred = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/pomms_to_choice_predictions.csv")
## Rows: 3420 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): sample, pred, id, treatment, true_week, timepoint
## dbl (4): Case, Control, auc, iteration
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pomms_importance = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/Machine_learning/LAD_lab_ML/code/pomms_to_choice_feature_importance.csv")
## Rows: 20 Columns: 226
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): label
## dbl (225): ATCACGTTTTCCGAAAACAAACAAAGGTTCAGAAAGCGAAAATAAAAAAG, ATCCTTATTTTGA...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aha_pred %>%
select(auc) %>%
distinct() %>%
ggplot()+
geom_boxplot(aes(y= auc, x= "AHA"))+
geom_point(aes(y= auc, x= "AHA"), alpha = 0.5)+
labs(title = "AHA AUC range")+
theme_classic()
pomms_pred %>%
select(auc) %>%
distinct() %>%
ggplot()+
geom_boxplot(aes(y= auc, x= "POMMS"))+
geom_point(aes(y= auc, x= "POMMS"), alpha = 0.5)+
labs(title = "POMMS AUC range")+
theme_classic()
max = aha_pred$auc %>%
max()
aha_pred %>%
filter(auc == max) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="AHA to CHOICE")+
theme_classic()
aha_pred %>%
filter(auc == 0.94) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="AHA to CHOICE")+
theme_classic()
max = pomms_pred$auc %>%
max()
pomms_pred %>%
filter(auc == max) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="POMMS to CHOICE")+
theme_classic()
pomms_pred %>%
filter(auc == 0.82) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="POMMS to CHOICE")+
theme_classic()
#AUCs above 0.7
aha_pred %>%
filter(auc > 0.7) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="AHA to CHOICE")+
theme_classic()
aha_pred %>%
filter(auc > 0.7) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="AHA to CHOICE")+
theme_classic()
pomms_pred %>%
filter(auc > 0.7) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Case, colour = timepoint))+
geom_point(aes(x= treatment, y= Case, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="POMMS to CHOICE")+
theme_classic()
pomms_pred %>%
filter(auc > 0.7) %>%
ggplot()+
geom_boxplot(aes(x= treatment, y= Control, colour = timepoint))+
geom_point(aes(x= treatment, y= Control, colour = timepoint), position=position_jitterdodge(), alpha = 0.2)+
labs(title ="POMMS to CHOICE")+
theme_classic()
# combined = readRDS("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/CHOICE/All_cohorts/data_objects/CHOICE_POMMS_AHA_combined_20221213.rds")
#
# lookup = combined %>%
# tax_table() %>%
# as.data.frame() %>%
# as_tibble(rownames = NA) %>%
# rownames_to_column("asv")
common_names = read_csv("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Brianna P/diet/food-dbs/data/outputs/dada2-compatible/trnL/trnLGH ASV common names.csv")
## Rows: 405 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): asv, name, taxon, common_name
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
c_ps = readRDS("/Users/aa370/Library/CloudStorage/Box-Box/project_davidlab/LAD_LAB_Personnel/Ammara_A/Projects/CHOICE/CHOICE_Trnl/CHOICE_20220912/ps_objects/choice_complete_20221205_clean.rds")
otu_clr = abundances(c_ps, "clr") %>% # clr transform
t()
c_ps = phyloseq(otu_table(otu_clr, taxa_are_rows = FALSE),
sample_data(sample_data(c_ps)),
tax_table(tax_table(c_ps)))
samdf = c_ps %>%
sample_data() %>%
as.data.frame() %>%
as_tibble(rownames = NA) %>%
rownames_to_column("sample") %>%
select(sample, id, treatment, true_week, timepoint)
otudf = c_ps %>%
otu_table() %>%
as.data.frame() %>%
as_tibble(rownames = NA) %>%
rownames_to_column("sample")
input = samdf %>%
left_join(otudf)
## Joining, by = "sample"
max = aha_pred$auc %>%
max()
#colnames(aha_importance)
importance_summary = aha_importance %>%
filter(auc == max) %>%
pivot_longer(cols = c("ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG":"CATAAACTGGTAGCGACCGGCACCACCGGTAAACTGATCGCAGAAGCTACCGGCT"), names_to = "food", values_to = "importance") %>%
group_by(label, food) %>%
summarise(mean_importance = mean(importance)) %>%
arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
pull(food)
top10 = ranking[1:10]
plot_food_direction(top10, input, "treatment")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(variable)
##
## # Now:
## data %>% select(all_of(variable))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(food)
##
## # Now:
## data %>% select(all_of(food))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation ideoms with `aes()`
name_list = c()
for(item in top10){
name = common_names %>%
filter(str_detect(asv, item)) %>%
pull(common_name)
name_list = append(name_list, name)
}
name_list
## [1] "cocoa bean"
## [2] "avocado, NA, NA, NA, NA, NA"
## [3] "NA, ceylon cinnamon, bay leaf, avocado, NA, NA"
## [4] "goji berry, NA, goji berry, cutleaf groundcherry, tomatillo, nightshade, NA, blackberry nightshade, potato, NA"
## [5] "garlic"
## [6] "muscadine grape, common grape vine, NA, NA"
## [7] "common grape vine"
## [8] "common grape vine"
## [9] "black chokeberry, apple, NA, crab apple, european pear, NA, afghan pear, nashi pear, siberian pear, hybrid chinese white pear"
## [10] "NA, pecan"
## [11] "sesame"
max = pomms_pred$auc %>%
max()
#colnames(pomms_importance)
importance_summary = pomms_importance %>%
filter(auc == max) %>%
pivot_longer(cols = c("ATCACGTTTTCCGAAAACAAACAAAGGTTCAGAAAGCGAAAATAAAAAAG":"ATCCTGTTTTCTCAAAACAAACAAAGGTTCAGAAAAAAAG"), names_to = "food", values_to = "importance") %>%
group_by(label, food) %>%
summarise(mean_importance = mean(importance)) %>%
arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
pull(food)
top10 = ranking[1:10]
plot_food_direction(top10, input, "treatment")
name_list = c()
for(item in top10){
name = common_names %>%
filter(str_detect(asv, item)) %>%
pull(common_name)
name_list = append(name_list, name)
}
name_list
## [1] "dandelion"
## [2] "sunflower, jerusalem artichoke, sunchoke, dandelion"
## [3] "ashanti pepper, long pepper, black pepper, wild betel"
## [4] "canola, rapeseed"
## [5] "brown mustard, canola, rapeseed, black mustard, cabbage, broccoli, cauliflower, kale, Brussels sprouts, collard greens, savoy, kohlrabi, gai lan, mustards, bok choy, canola, rapeseed, field mustard, napa cabbage, turnip, rocket, NA, white mustard"
## [6] "oregano, NA, thyme"
## [7] "sage"
## [8] "sage"
## [9] "NA, NA, rosemary"
## [10] "lemon balm, rosemary"
## [11] "sesame"
## [12] NA
## [13] NA
## [14] "NA, NA"
## [15] NA
## [16] "rye, NA, NA, NA, NA, NA, bread wheat, NA, NA, NA, emmer wheat, spelt, einkorn, wild einkorn, spelt, NA, domesticated hulled wheat, NA, rivet wheat, durum wheat, wild einkorn, NA"
## [17] "coconut"
## [18] "fennel, dill, angelica, cumin, wild carrot, parsnip, parsley"
## [19] "lettuce"
## [20] "flaxseed"
## [21] "tomato, NA"
#colnames(aha_importance)
importance_summary = aha_importance %>%
filter(auc >0.7) %>%
pivot_longer(cols = c("ATCCTATTATTTTATTATTTTACGAAACTAAACAAAGGTTCAGCAAGCGAGAATAATAAAAAAAG":"CATAAACTGGTAGCGACCGGCACCACCGGTAAACTGATCGCAGAAGCTACCGGCT"), names_to = "food", values_to = "importance") %>%
group_by(label, food) %>%
summarise(mean_importance = mean(importance)) %>%
arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
pull(food)
top10 = ranking[1:10]
plot_food_direction(top10, input, "treatment")
name_list = c()
for(item in top10){
name = common_names %>%
filter(str_detect(asv, item)) %>%
pull(common_name)
name_list = append(name_list, name)
}
name_list
## [1] "cocoa bean"
## [2] "avocado, NA, NA, NA, NA, NA"
## [3] "NA, ceylon cinnamon, bay leaf, avocado, NA, NA"
## [4] "garlic"
## [5] "goji berry, NA, goji berry, cutleaf groundcherry, tomatillo, nightshade, NA, blackberry nightshade, potato, NA"
## [6] "muscadine grape, common grape vine, NA, NA"
## [7] "common grape vine"
## [8] "common grape vine"
## [9] "sesame"
## [10] "NA, pecan"
## [11] "black chokeberry, apple, NA, crab apple, european pear, NA, afghan pear, nashi pear, siberian pear, hybrid chinese white pear"
importance_summary = pomms_importance %>%
filter(auc >0.7) %>%
pivot_longer(cols = c("ATCACGTTTTCCGAAAACAAACAAAGGTTCAGAAAGCGAAAATAAAAAAG":"ATCCTGTTTTCTCAAAACAAACAAAGGTTCAGAAAAAAAG"), names_to = "food", values_to = "importance") %>%
group_by(label, food) %>%
summarise(mean_importance = mean(importance)) %>%
arrange(label, desc(mean_importance))
## `summarise()` has grouped output by 'label'. You can override using the
## `.groups` argument.
ranking = importance_summary %>%
pull(food)
top10 = ranking[1:10]
plot_food_direction(top10, input, "treatment")
name_list = c()
for(item in top10){
name = common_names %>%
filter(str_detect(asv, item)) %>%
pull(common_name)
name_list = append(name_list, name)
}
name_list
## [1] "dandelion"
## [2] "sunflower, jerusalem artichoke, sunchoke, dandelion"
## [3] "tea"
## [4] "ashanti pepper, long pepper, black pepper, wild betel"
## [5] "canola, rapeseed"
## [6] "brown mustard, canola, rapeseed, black mustard, cabbage, broccoli, cauliflower, kale, Brussels sprouts, collard greens, savoy, kohlrabi, gai lan, mustards, bok choy, canola, rapeseed, field mustard, napa cabbage, turnip, rocket, NA, white mustard"
## [7] "rye, NA, NA, NA, NA, NA, bread wheat, NA, NA, NA, emmer wheat, spelt, einkorn, wild einkorn, spelt, NA, domesticated hulled wheat, NA, rivet wheat, durum wheat, wild einkorn, NA"
## [8] "fennel, dill, angelica, cumin, wild carrot, parsnip, parsley"
## [9] "goji berry, NA, goji berry, cutleaf groundcherry, tomatillo, nightshade, NA, blackberry nightshade, potato, NA"
## [10] "coconut"
## [11] "lettuce"
## [12] "oat"